Authors:

Group 3

Jessica Bormann Sophie Elting Casper van Tongeren

Layout

This document will consist of all data used in our pressure cooker. First we will use the code provided by Steffen Greup, to set up and load the raw data files and do some minor data transformations. After that we will add the data of Steffen and Abe, showing us a basic first model. Then we will do some of our own data exploration. Finally we will start constructing our own models and comparing their effectiveness to generalize to new data

1. Data loading and preparation

rm(list = ls()); graphics.off()

#' - in each comment blok the links refer to websites where you can download the files needed: #nolint
#' - all variable names are in Dutch; you can rename these yourself. In your final presentation #nolint
#'  make sure the client know about what variables your are talking in your report #nolint

###########################################################################
# Populatie en adres (location details)
# https://duo.nl/open_onderwijsdata/databestanden/po/adressen/adressen-po-3.jsp
# https://duo.nl/open_onderwijsdata/images/03-alle-vestigingen-bo.csv
###########################################################################

d <- readr::read_csv2("data/input/03-alle-vestigingen-bo.csv") %>%
  janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 6215 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (29): PROVINCIE, BRIN NUMMER, VESTIGINGSNUMMER, VESTIGINGSNAAM, STRAATNA...
## dbl  (1): BEVOEGD GEZAG NUMMER
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d <- d %>%
  dplyr::select(vestigingsnummer, postcode,
                plaatsnaam, gemeentenaam,
                gemeentenummer, provincie)

d <- d %>%
  dplyr::mutate(brin_nummer = stringr::str_sub(vestigingsnummer, 1, 4),
                vestigingsnummer = stringr::str_sub(vestigingsnummer, 5, -1))

###########################################################################
# Aantal leerlingen (number of students)
# https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/po-totaal/leerlingen-po-6.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/06-historische-data-leerlingaantallen-po.xlsx #nolint
###########################################################################

d_n_lln <- readr::read_csv2("data/input/06-historische-data-leerlingaantallen-po.csv", #nolint
                            guess_max = 10000) %>%
                            janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 9539 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## chr (32): BRIN_NUMMER, VESTIGINGSNUMMER, INSTELLINGSNAAM_VESTIGING, VEST_SOO...
## dbl  (3): BEVOEGD_GEZAG_NUMMER, PEILDATUM, LAATSTE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d_n_lln <- d_n_lln %>%
  dplyr::select(brin_nummer, vestigingsnummer,
                type_po, dplyr::starts_with("leerlingen_")) %>%
  dplyr::filter(type_po == "BO") %>%
  dplyr::select(-type_po)

d_n_lln <- d_n_lln %>%
  tidyr::pivot_longer(cols = dplyr::starts_with("leerlingen_"),
                      names_to = "jaar",
                      values_to = "aantal_leerlingen",
                      names_prefix = "leerlingen_") %>%
  dplyr::mutate(jaar = as.integer(jaar),
                schooljaar = paste(jaar, jaar + 1, sep = "-"))

###########################################################################
# Toetsresultaten, gemeten in behaalde referentieniveaus (test results)
# https://duo.nl/open_onderwijsdata/databestanden/po/leerlingen-po/bo-sbo/refniveau.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2018-2019.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2017-2018.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2016-2017.csv #nolint
# https://duo.nl/open_onderwijsdata/images/10-leerlingen-bo-referentieniveaus-2015-2016.csv #nolint
###########################################################################

d_toets <- list.files("data/input/referentieniveaus/",
                      full.names = TRUE, pattern = ".csv") %>%
  lapply(., FUN = readr::read_csv2,
         col_types = cols(VESTIGINGSNUMMER = col_character())) %>%
  dplyr::bind_rows() %>%
  janitor::clean_names()
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
d_toets <- d_toets %>%
  dplyr::mutate(vestigingsnummer = stringr::str_pad(vestigingsnummer,
                                                    width = 2, side = "left", pad = "0")) #nolint

d_toets <- d_toets %>%
  dplyr::select(peildatum, brin_nummer,
                vestigingsnummer,
                dplyr::ends_with("1f"),
                dplyr::ends_with("2f"), rekenen_1s) %>%
  dplyr::mutate(rekenen_2f = rekenen_1s + rekenen_2f) %>%
  dplyr::select(-rekenen_1s) %>%
  dplyr::mutate(peiljaar = as.integer(stringr::str_sub(as.character(peildatum), 1, 4)), #nolint
                schooljaar = paste(peiljaar - 1, peiljaar, sep = "-")) %>%
  dplyr::select(-peildatum, -peiljaar)

d_toets <- d_toets %>%
  dplyr::mutate(n_observaties = rekenen_lager1f + rekenen_1f
                                + rekenen_2f + lv_lager1f
                                + lv_1f + lv_2f + tv_lager1f
                                + tv_1f + tv_2f)

d_toets <- d_toets %>%
  dplyr::mutate(perc_2F = 100 * (rekenen_2f + lv_2f + tv_2f) / n_observaties)

d_toets <- d_toets %>%
  dplyr::select(brin_nummer, vestigingsnummer,
                schooljaar, n_observaties, perc_2F)

###########################################################################
# Schoolweging toevoegen (what does this mean?)
#
# https://www.cbs.nl/nl-nl/maatwerk/2018/46/gemiddelde-verwachte-schoolscores-2015-2017 #nolint
# https://www.cbs.nl/-/media/_excel/2018/46/181109-gemiddelde-verwachte-schoolscores-def.xlsx #nolint
#
# https://www.cbs.nl/nl-nl/maatwerk/2021/12/schoolweging-2018-2020
# https://www.cbs.nl/-/media/_excel/2021/12/schoolweging-2018-2020.xlsx
###########################################################################

d_weging_2015 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
                                   sheet = "Tabel 1", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2015-2016") %>%
  dplyr::rename(schoolweging = gemiddelde_score)

d_weging_2016 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
                                   sheet = "Tabel 2", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2016-2017") %>%
  dplyr::rename(schoolweging = gemiddelde_score)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C6313 / R6313C3: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in D6313 / R6313C4: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in E6313 / R6313C5: got '.'
d_weging_2017 <- readxl::read_xlsx("data/input/181109 Gemiddelde verwachte schoolscores-def.xlsx", #nolint
                                   sheet = "Tabel 3", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2017-2018") %>%
  dplyr::rename(schoolweging = gemiddelde_score)
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in C6327 / R6327C3: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in D6327 / R6327C4: got '.'
## Warning in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
## Expecting numeric in E6327 / R6327C5: got '.'
d_weging_2018 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
                                   sheet = "Tabel 1", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2018-2019")

d_weging_2019 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
                                   sheet = "Tabel 2", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2019-2020")

d_weging_2020 <- readxl::read_xlsx("data/input/schoolweging-2018-2020.xlsx",
                                   sheet = "Tabel 3", skip = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(schooljaar = "2020-2021")

d_weging <- dplyr::bind_rows(d_weging_2015, d_weging_2016, d_weging_2017,
                             d_weging_2018, d_weging_2019, d_weging_2020) %>%
  dplyr::select(brin, vestiging, schooljaar,
                schoolweging, schoolweging_spreiding = spreiding) %>%
  tidyr::drop_na(brin)

rm(d_weging_2015, d_weging_2016, d_weging_2017,
   d_weging_2018, d_weging_2019, d_weging_2020)

d_weging <- d_weging %>%
  dplyr::rename(brin_nummer = brin,
                vestigingsnummer = vestiging)

###########################################################################
# Add open data from CBS, based on postalcodes
###########################################################################

library(cbsodataR)

# all_tables <- cbs_get_toc() #nolint
#
# kerncijfers <- all_tables %>%
# dplyr::filter(stringr::str_detect(ShortTitle, pattern = 'Kerncijfers')) #nolint

if (isFALSE(file.exists("data/input/cbs_open_data.rds"))) {

  vraag_cbs_data_op <- function(tabelnummer, jaar) {

    x <- cbs_get_data(tabelnummer, select = c("SoortRegio_2",
                                              "Codering_3",
                                              "AantalInwoners_5",
                                              "k_0Tot15Jaar_8",
                                              "Bevolkingsdichtheid_33"))

    x <- x %>%
      dplyr::mutate(dplyr::across(where(is.character), stringr::str_squish)) %>%
      dplyr::filter(SoortRegio_2 == "Gemeente") %>%
      dplyr::select(-SoortRegio_2) %>%
      dplyr::mutate(jaar = jaar)

    return(x)

  }

  d_cbs <- mapply(FUN = vraag_cbs_data_op,
                  tabelnummer = c("83487NED", "83765NED", "84286NED",
                                  "84583NED", "84799NED", "85039NED"),
                  jaar = 2016:2021,
                  SIMPLIFY = FALSE) %>%
    dplyr::bind_rows()

  saveRDS(d_cbs, "data/input/cbs_open_data.rds")

} else {

  d_cbs <- readRDS("data/input/cbs_open_data.rds")

}

d_cbs <- d_cbs %>%
  dplyr::rename(gemeentenummer                     = Codering_3,
                gemeente_aantal_inwoners           = AantalInwoners_5,
                gemeente_aantal_inwoners_0_15_jaar = k_0Tot15Jaar_8,
                gemeente_bevolkingsdichtheid       = Bevolkingsdichtheid_33)

d_cbs <- d_cbs %>%
  dplyr::mutate(gemeentenummer = stringr::str_sub(gemeentenummer, 3, -1),
                gemeente_perc_inwoners_0_tot_15_jaar = 100 * gemeente_aantal_inwoners_0_15_jaar / gemeente_aantal_inwoners, #nolint
                schooljaar = paste(jaar - 1, jaar, sep = "-")) %>%
  dplyr::select(gemeentenummer, schooljaar, gemeente_aantal_inwoners,
                gemeente_perc_inwoners_0_tot_15_jaar,
                gemeente_bevolkingsdichtheid)

###########################################################################
# Personeel (staff)
###########################################################################

#############################
# More features
# https://duo.nl/open_onderwijsdata/databestanden/po/onderwijspersoneel/po-personeel2.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/02-onderwijspersoneel-po-in-fte-2011-2020.xlsx #nolint
#############################

d_personeel <- readxl::read_xlsx("data/input/02-onderwijspersoneel-po-in-fte-2011-2020.xlsx", #nolint
                                 sheet = "per owtype-bestuur-brin-functie") %>%
  janitor::clean_names() %>%
  dplyr::select(-onderwijstype, -bevoegd_gezag)

d_personeel <- d_personeel %>%
  dplyr::mutate(functiegroep = dplyr::case_when(
    functiegroep == "Directie" ~ "dir",
    functiegroep == "Leraren in opleiding (LIO)" ~ "lio",
    functiegroep == "Onbekend" ~ "onbekend",
    functiegroep == "Onderwijsgevend personeel" ~ "op",
    functiegroep == "Onderwijsondersteunend personeel (OOP/OBP)" ~ "oop_obp",
  ))

# wanneer er geen data is staat er bij de gemiddeldes 0 ipv een missing
d_personeel <- d_personeel %>%
  dplyr::mutate(across(dplyr::starts_with("gemiddelde_"),
                       ~ifelse(. == 0, yes = NA_real_, no = .)))

d_personeel <- d_personeel %>%
  dplyr::filter(stringr::str_to_lower(brin_nummer) != "bovenschools")

d_personeel <- d_personeel %>%
  tidyr::pivot_longer(cols = -c(brin_nummer, functiegroep),
                      names_to = "variabele_en_jaar",
                      values_to = "waarde") %>%
  dplyr::mutate(jaar      = stringr::str_sub(variabele_en_jaar,
                                             start = -4, end = -1),
                jaar      = as.integer(jaar),
                variabele = stringr::str_sub(variabele_en_jaar,
                                             start = 1, end = -6)) %>%
  dplyr::select(-variabele_en_jaar)

d_personeel <- d_personeel %>%
  tidyr::pivot_wider(names_from = variabele,
                     values_from = waarde)

d_personeel_a <- d_personeel %>%
  dplyr::filter(functiegroep == "op") %>%
  dplyr::mutate(perc_man = 100 * ftes_mannen / ftes,
                perc_tijdelijk_contract = 100 * ftes_personen_in_tijdelijke_dienst / ftes) %>% #nolint
  dplyr::select(brin_nummer, jaar, ftes,
                gemiddelde_aanstelling = gemiddelde_ftes,
                gemiddelde_leeftijd, perc_man, perc_tijdelijk_contract)

# gemiddelde_ftes, # gemiddelde_leeftijd

d_personeel_b <- d_personeel %>%
  dplyr::filter(functiegroep == "oop_obp") %>%
  dplyr::select(brin_nummer, jaar, ftes_ondersteunend = ftes)

d_personeel_tot <- d_personeel_a %>%
  dplyr::left_join(d_personeel_b, by = c("brin_nummer", "jaar")) %>%
  dplyr::mutate(ratio_leraar_ondersteuner = ftes / ftes_ondersteunend,
                schooljaar = paste(jaar, jaar + 1, sep = "-")) %>%
  dplyr::select(-ftes, -ftes_ondersteunend, -jaar)

rm(d_personeel_a, d_personeel_b)

#############################
# leerling-leraar ratio's
# (teacher-student ratio)
# https://duo.nl/open_onderwijsdata/databestanden/po/onderwijspersoneel/po-personeel3.jsp #nolint
# https://duo.nl/open_onderwijsdata/images/03-leerling-leraarratio-po-per-instelling.xlsx #nolint
#############################

d_leraar_leerling <- readxl::read_xlsx("data/input/03-leerling-leraarratio-po-per-instelling.xlsx", #nolint
                                       sheet = "7. ratios-regio-bestuur-brin") %>% #nolint
  janitor::clean_names()

d_leraar_leerling <- d_leraar_leerling %>%
  dplyr::filter(o == "BAO") %>%
  dplyr::select(brin_nummer = brin, dplyr::starts_with("ratio_"))

d_leraar_leerling <- d_leraar_leerling %>%
  tidyr::pivot_longer(cols = dplyr::starts_with("ratio_"),
                      names_to = "jaar",
                      values_to = "ratio_leerling_leraar",
                      names_prefix = "ratio_") %>%
  dplyr::mutate(jaar = as.integer(jaar),
                schooljaar = paste(jaar, jaar + 1, sep = "-")) %>%
  dplyr::select(-jaar)

###################
# Combineren (combine data)
# Ik beperk de dataset nu tot de jaren waar ook de toetsresultaten beschikbaar zijn #nolint
# (this join makes a subset of data based on availability in d_toets)
# Sommige databronnen hebben ook oudere gegevens, die worden nu dus niet meegenomen. #nolint
###################

d_tot <- d_toets %>%
  dplyr::left_join(d,                 by = c("brin_nummer", "vestigingsnummer")) %>% #nolint
  dplyr::left_join(d_cbs,             by = c("gemeentenummer", "schooljaar")) %>% #nolint
  dplyr::left_join(d_leraar_leerling, by = c("brin_nummer", "schooljaar")) %>% #nolint
  dplyr::left_join(d_personeel_tot,   by = c("brin_nummer", "schooljaar")) %>% #nolint
  dplyr::left_join(d_n_lln,           by = c("brin_nummer", "vestigingsnummer", "schooljaar")) %>% #nolint
  dplyr::left_join(d_weging,          by = c("brin_nummer", "vestigingsnummer", "schooljaar")) #nolint

Simple example model

Now that all the data is loaded and a combined dataframe is made, we can start to build a very simple model. Here, an example by Steffen is provided:

#############################
# Voorbeeldmodelletje om uitkomstvariabele en correctie
# voor leerlingpopulatie (schoolweging) te laten zien
# (play model provided by the client)
#############################

d_tot <- d_tot %>%
  dplyr::mutate(prop_2F = perc_2F / 100)

mod1 <- glm(prop_2F ~ scale(schoolweging),
            weights = n_observaties,
            family  = binomial(),
            data    = d_tot)

predict(mod1, newdata = tibble(schoolweging = c(25, 35)), type = "response")
##         1         2 
## 0.6682653 0.4974504

2. Further data prep and more advanced models

Up next we use play code 1, also provided by Steffen. This code shows an example of how to make a model that uses differences in Perc_2F as an outcome variable for a model using various predictors

# ------------------------------------------
# play code to get you started
# needed opbjects:
# - d_toets should be available in the workspace after running make_dataset.
# - d_personeel_tot should be available in the workspace after running make_dataset. #nolint
# [it might be that this is not the best way to prepare your dataset!]

# here I reshape to wide format and calculate three diff variables:
tmp_file <- d_toets %>%
  select(-n_observaties) %>%
  pivot_wider(names_from = schooljaar, values_from =  perc_2F) %>%
  mutate(diff1 = `2016-2017` - `2015-2016`,
         diff2 = `2017-2018` - `2016-2017`,
         diff3 = `2018-2019` - `2017-2018`)

# some quick inspections; look very normal so lets do some linear modelling
tmp_file %>%
  select(diff1, diff2, diff3) %>%
  ggpairs(aes(alpha = .1))
## Warning: Removed 387 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 582 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 720 rows containing missing values
## Warning: Removed 582 rows containing missing values (geom_point).
## Warning: Removed 513 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 662 rows containing missing values
## Warning: Removed 720 rows containing missing values (geom_point).
## Warning: Removed 662 rows containing missing values (geom_point).
## Warning: Removed 589 rows containing non-finite values (stat_density).

# first model
# is the growth related to the scores on 20217-2018?
# if yes and negative effect, this could indicate regr towards the mean.
mod <- lm(diff3 ~ scale(`2017-2018`), data = tmp_file) # regr towards the mean!
summary(mod)
## 
## Call:
## lm(formula = diff3 ~ scale(`2017-2018`), data = tmp_file)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.753  -7.777   0.650   8.242  48.370 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.8808     0.1586   5.553 2.92e-08 ***
## scale(`2017-2018`)  -8.6848     0.1597 -54.367  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.3 on 6011 degrees of freedom
##   (589 observations deleted due to missingness)
## Multiple R-squared:  0.3296, Adjusted R-squared:  0.3295 
## F-statistic:  2956 on 1 and 6011 DF,  p-value: < 2.2e-16
# now add some more data:
tmp_file <- d_personeel_tot %>%
  filter(schooljaar == "2017-2018") %>%
  right_join(tmp_file, by = c("brin_nummer"))

# now add some more data:
tmp_file <- d_weging %>%
  filter(schooljaar == "2017-2018") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer"))

# extend the model
mod <- lm(diff3 ~ 1
          + scale(`2017-2018`)
          + scale(schoolweging)
          + scale(perc_man)
          + scale(gemiddelde_aanstelling)
          + scale(perc_tijdelijk_contract)
          , data = tmp_file) # regr towards the mean
summary(mod)
## 
## Call:
## lm(formula = diff3 ~ 1 + scale(`2017-2018`) + scale(schoolweging) + 
##     scale(perc_man) + scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract), 
##     data = tmp_file)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.193  -7.283   0.181   7.484  44.514 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.87639    0.15157   5.782 7.76e-09 ***
## scale(`2017-2018`)             -10.62431    0.16951 -62.677  < 2e-16 ***
## scale(schoolweging)             -4.34980    0.16846 -25.821  < 2e-16 ***
## scale(perc_man)                  0.08159    0.15703   0.520 0.603403    
## scale(gemiddelde_aanstelling)   -0.59366    0.15957  -3.720 0.000201 ***
## scale(perc_tijdelijk_contract)  -0.32434    0.15743  -2.060 0.039425 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.52 on 5777 degrees of freedom
##   (819 observations deleted due to missingness)
## Multiple R-squared:  0.4057, Adjusted R-squared:  0.4052 
## F-statistic: 788.8 on 5 and 5777 DF,  p-value: < 2.2e-16
# model selection:

mod_step <- step(mod)
## Start:  AIC=28270.15
## diff3 ~ 1 + scale(`2017-2018`) + scale(schoolweging) + scale(perc_man) + 
##     scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract)
## 
##                                  Df Sum of Sq     RSS   AIC
## - scale(perc_man)                 1        36  766156 28268
## <none>                                         766120 28270
## - scale(perc_tijdelijk_contract)  1       563  766683 28272
## - scale(gemiddelde_aanstelling)   1      1836  767956 28282
## - scale(schoolweging)             1     88419  854539 28900
## - scale(`2017-2018`)              1    520964 1287084 31268
## 
## Step:  AIC=28268.42
## diff3 ~ scale(`2017-2018`) + scale(schoolweging) + scale(gemiddelde_aanstelling) + 
##     scale(perc_tijdelijk_contract)
## 
##                                  Df Sum of Sq     RSS   AIC
## <none>                                         766156 28268
## - scale(perc_tijdelijk_contract)  1       576  766732 28271
## - scale(gemiddelde_aanstelling)   1      1801  767957 28280
## - scale(schoolweging)             1     88456  854612 28898
## - scale(`2017-2018`)              1    521875 1288031 31271
summary(mod_step)
## 
## Call:
## lm(formula = diff3 ~ scale(`2017-2018`) + scale(schoolweging) + 
##     scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract), 
##     data = tmp_file)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -51.302  -7.279   0.205   7.498  44.572 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      0.8768     0.1516   5.785 7.63e-09 ***
## scale(`2017-2018`)             -10.6198     0.1693 -62.736  < 2e-16 ***
## scale(schoolweging)             -4.3505     0.1684 -25.828  < 2e-16 ***
## scale(gemiddelde_aanstelling)   -0.5801     0.1574  -3.685 0.000231 ***
## scale(perc_tijdelijk_contract)  -0.3278     0.1573  -2.084 0.037185 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.52 on 5778 degrees of freedom
##   (819 observations deleted due to missingness)
## Multiple R-squared:  0.4057, Adjusted R-squared:  0.4053 
## F-statistic:   986 on 4 and 5778 DF,  p-value: < 2.2e-16

example by Abe

Next up, we show the example provided by Abe. This model corrects for the effect of schoolweging, when predicting the difference in Perc_2f scores.

# --------------------------------------------------------
# play model 2.0 [after client interview]
# predict change in 2016-2017 to 2017-2018
# corrected for schoolweging
# including the score at 2016-2017; to model the regr. towards the mean.

tmp_file_2 <- d_weging %>%
  rename("schoolweging_2016_2017" = "schoolweging") %>%
  filter(schooljaar == "2017-2018") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
  rename("schoolweging_2017_2018" = "schoolweging")

# missing data issues..

tmp_file_clean <- tmp_file_2 %>%
  filter(!is.na(`2016-2017`),
         !is.na(`2017-2018`),
         !is.na(schoolweging_2016_2017),
         !is.na(schoolweging_2017_2018))

mod <- lm(`2016-2017` ~ 1 + schoolweging_2016_2017, data = tmp_file_clean)
tmp_file_clean$res_2016_2017 <- residuals(mod)

mod <- lm(`2017-2018` ~ 1 + schoolweging_2017_2018, data = tmp_file_clean)
tmp_file_clean$res_2017_2018 <- residuals(mod)

tmp_file_clean$diff_corrected_3 <- tmp_file_clean$res_2017_2018 - tmp_file_clean$res_2016_2017 #nolint

# extend the model
mod <- lm(diff_corrected_3 ~ 1
          + scale(res_2016_2017)
          # + scale(schoolweging) #nolint
          + scale(perc_man)
          + scale(gemiddelde_aanstelling)
          + scale(perc_tijdelijk_contract)
          , data = tmp_file_clean) # regr towards the mean
summary(mod)
## 
## Call:
## lm(formula = diff_corrected_3 ~ 1 + scale(res_2016_2017) + scale(perc_man) + 
##     scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract), 
##     data = tmp_file_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.383  -7.727   0.255   7.793  43.761 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.08447    0.15697  -0.538 0.590492    
## scale(res_2016_2017)           -9.83952    0.15724 -62.578  < 2e-16 ***
## scale(perc_man)                 0.58373    0.15913   3.668 0.000246 ***
## scale(gemiddelde_aanstelling)  -0.22331    0.15917  -1.403 0.160674    
## scale(perc_tijdelijk_contract) -0.49114    0.15734  -3.122 0.001808 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.94 on 5780 degrees of freedom
##   (215 observations deleted due to missingness)
## Multiple R-squared:  0.4043, Adjusted R-squared:  0.4039 
## F-statistic: 980.6 on 4 and 5780 DF,  p-value: < 2.2e-16
summary(step(mod))
## Start:  AIC=28696.16
## diff_corrected_3 ~ 1 + scale(res_2016_2017) + scale(perc_man) + 
##     scale(gemiddelde_aanstelling) + scale(perc_tijdelijk_contract)
## 
##                                  Df Sum of Sq     RSS   AIC
## - scale(gemiddelde_aanstelling)   1       281  824125 28696
## <none>                                         823844 28696
## - scale(perc_tijdelijk_contract)  1      1389  825233 28704
## - scale(perc_man)                 1      1918  825762 28708
## - scale(res_2016_2017)            1    558158 1382003 31687
## 
## Step:  AIC=28696.13
## diff_corrected_3 ~ scale(res_2016_2017) + scale(perc_man) + scale(perc_tijdelijk_contract)
## 
##                                  Df Sum of Sq     RSS   AIC
## <none>                                         824125 28696
## - scale(perc_tijdelijk_contract)  1      1454  825579 28704
## - scale(perc_man)                 1      1737  825862 28706
## - scale(res_2016_2017)            1    557928 1382053 31685
## 
## Call:
## lm(formula = diff_corrected_3 ~ scale(res_2016_2017) + scale(perc_man) + 
##     scale(perc_tijdelijk_contract), data = tmp_file_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.318  -7.685   0.294   7.846  43.692 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    -0.08445    0.15698  -0.538 0.590642    
## scale(res_2016_2017)           -9.83248    0.15717 -62.560  < 2e-16 ***
## scale(perc_man)                 0.54864    0.15716   3.491 0.000485 ***
## scale(perc_tijdelijk_contract) -0.50197    0.15716  -3.194 0.001411 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.94 on 5781 degrees of freedom
##   (215 observations deleted due to missingness)
## Multiple R-squared:  0.4041, Adjusted R-squared:  0.4038 
## F-statistic:  1307 on 3 and 5781 DF,  p-value: < 2.2e-16

3. Data exploration

Then, for our first look at the data we did some data exploration. Here we looked at anomalies in the data, as well as distributions and other points of interest that we may use in making our models later on.

We inspect the total dataset d_tot

head(d_tot)
## # A tibble: 6 × 24
##   brin_nummer vestigingsnummer schooljaar n_observaties perc_2F postcode
##   <chr>       <chr>            <chr>              <dbl>   <dbl> <chr>   
## 1 00AP        00               2015-2016             36    66.7 2716 PH 
## 2 00AR        00               2015-2016             51    76.5 6107 AW 
## 3 00AV        00               2015-2016             21    33.3 3201 CN 
## 4 00AZ        00               2015-2016             57    66.7 2971 AR 
## 5 00BA        00               2015-2016             72    65.3 <NA>    
## 6 00BB        00               2015-2016             45    60   9944 AR 
## # … with 18 more variables: plaatsnaam <chr>, gemeentenaam <chr>,
## #   gemeentenummer <chr>, provincie <chr>, gemeente_aantal_inwoners <int>,
## #   gemeente_perc_inwoners_0_tot_15_jaar <dbl>,
## #   gemeente_bevolkingsdichtheid <int>, ratio_leerling_leraar <dbl>,
## #   gemiddelde_aanstelling <dbl>, gemiddelde_leeftijd <dbl>, perc_man <dbl>,
## #   perc_tijdelijk_contract <dbl>, ratio_leraar_ondersteuner <dbl>, jaar <int>,
## #   aantal_leerlingen <chr>, schoolweging <dbl>, …

Next we want to make sure that all the variables are of the correct datatype and if not, we should change that

lapply(d_tot, class)
## $brin_nummer
## [1] "character"
## 
## $vestigingsnummer
## [1] "character"
## 
## $schooljaar
## [1] "character"
## 
## $n_observaties
## [1] "numeric"
## 
## $perc_2F
## [1] "numeric"
## 
## $postcode
## [1] "character"
## 
## $plaatsnaam
## [1] "character"
## 
## $gemeentenaam
## [1] "character"
## 
## $gemeentenummer
## [1] "character"
## 
## $provincie
## [1] "character"
## 
## $gemeente_aantal_inwoners
## [1] "integer"
## 
## $gemeente_perc_inwoners_0_tot_15_jaar
## [1] "numeric"
## 
## $gemeente_bevolkingsdichtheid
## [1] "integer"
## 
## $ratio_leerling_leraar
## [1] "numeric"
## 
## $gemiddelde_aanstelling
## [1] "numeric"
## 
## $gemiddelde_leeftijd
## [1] "numeric"
## 
## $perc_man
## [1] "numeric"
## 
## $perc_tijdelijk_contract
## [1] "numeric"
## 
## $ratio_leraar_ondersteuner
## [1] "numeric"
## 
## $jaar
## [1] "integer"
## 
## $aantal_leerlingen
## [1] "character"
## 
## $schoolweging
## [1] "numeric"
## 
## $schoolweging_spreiding
## [1] "numeric"
## 
## $prop_2F
## [1] "numeric"

We notice that “aantal leerlingen” is a character variable, while it should be numeric. We transform this:

length(which(is.na(d_tot$aantal_leerlingen)))
## [1] 1007
length(which(is.na(as.numeric(d_tot$aantal_leerlingen))))
## Warning in which(is.na(as.numeric(d_tot$aantal_leerlingen))): NAs introduced by
## coercion
## [1] 1008
# one NA is added when transforming to numeric

d_tot <- d_tot %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))
## Warning in mask$eval_all_mutate(quo): NAs introduced by coercion
class(d_tot$aantal_leerlingen)
## [1] "numeric"

Up next, we tackle the issue of missing data. In this dataset there are 2 types of missing or wrong data. The first is NA data (missing values), the second is Inf data (which can occur when calculating a proportion and having to divide by 0).

# Number of observations
nrow(d_tot)
## [1] 25718
# remove NAs
d_tot <- d_tot %>%
  drop_na()

# Number of observations after removing NAs
nrow(d_tot)
## [1] 22594

For some further exploration of data we look at some of our variables and their distributions

d_tot %>% count(vestigingsnummer)
## # A tibble: 5 × 2
##   vestigingsnummer     n
##   <chr>            <int>
## 1 00               22249
## 2 01                 289
## 3 02                  38
## 4 03                  15
## 5 04                   3
d_tot %>% count(schooljaar)
## # A tibble: 4 × 2
##   schooljaar     n
##   <chr>      <int>
## 1 2015-2016   5558
## 2 2016-2017   5579
## 3 2017-2018   5552
## 4 2018-2019   5905
d_tot %>% select(perc_2F) %>% summary()
##     perc_2F      
##  Min.   :  0.00  
##  1st Qu.: 50.00  
##  Median : 59.65  
##  Mean   : 58.53  
##  3rd Qu.: 68.33  
##  Max.   :100.00
d_tot %>% select(schoolweging) %>% summary()
##   schoolweging  
##  Min.   :18.90  
##  1st Qu.:27.80  
##  Median :29.80  
##  Mean   :29.91  
##  3rd Qu.:31.90  
##  Max.   :41.51

Now that we have a better understanding of the data, we will be taking a look at some of the distributions. Maybe we can already see if there are any high correlations or remarkable distributions among the quantitative variables

d_tot %>% select(perc_2F, n_observaties, perc_man, aantal_leerlingen,
                 ratio_leraar_ondersteuner, ratio_leerling_leraar,
                 gemeente_perc_inwoners_0_tot_15_jaar,
                 schoolweging_spreiding, gemiddelde_aanstelling) %>%
  ggpairs(aes(alpha = .1))
## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).
## Warning: Removed 1644 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).

## Warning: Removed 1 rows containing missing values (geom_text).

Now we look at the individual distributions of the quantitative variables. We start with the normally distributed variables:

d_tot %>% ggplot(aes(prop_2F)) + geom_density()

d_tot %>% ggplot(aes(schoolweging)) + geom_density()

d_tot %>% ggplot(aes(schoolweging_spreiding)) + geom_density()

d_tot %>% ggplot(aes(ratio_leerling_leraar)) + geom_density()

d_tot %>% ggplot(aes(gemiddelde_aanstelling)) + geom_density()

d_tot %>% ggplot(aes(gemiddelde_leeftijd)) + geom_density()

d_tot %>% ggplot(aes(aantal_leerlingen)) + geom_density()

d_tot %>% ggplot(aes(gemeente_perc_inwoners_0_tot_15_jaar)) + geom_density()

There were also a few variables that weren’t normally distributed, these are plotted below:

d_tot %>% ggplot(aes(perc_man)) + geom_density()

d_tot %>% ggplot(aes(perc_tijdelijk_contract)) + geom_density()

d_tot %>% ggplot(aes(ratio_leraar_ondersteuner)) + geom_density()
## Warning: Removed 1644 rows containing non-finite values (stat_density).

d_tot %>% ggplot(aes(gemeente_aantal_inwoners)) + geom_density() # transform

d_tot %>% ggplot(aes(gemeente_bevolkingsdichtheid)) + geom_density() #transform

4. Model fitting

The first thing we noticed when looking at Abe’s data is that he used diff3_corrected to make a calculation for the year which corresponds with diff2. So we corrected this when making the dataframe for the year 2017-2018. We then do some more transformations to make our final training data.

tmp_file_clean$diff_corrected_2 <- tmp_file_clean$res_2017_2018 -
  tmp_file_clean$res_2016_2017

df_train <- tmp_file_clean %>%
  select(- diff_corrected_3)

# Train data frame
df_train <- df_train %>%
  select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
         `2018-2019`, diff1, diff2, diff3, res_2016_2017, res_2017_2018,
         diff_corrected_2)

train_tot <- d_tot %>%
  filter(schooljaar == "2017-2018") %>%
  left_join(df_train, by = c("brin_nummer", "vestigingsnummer")) %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))

train_tot <- train_tot[complete.cases(train_tot), ]

train_tot <- train_tot  %>%
  rename("y2015_2016" = "2015-2016",
         "y2016_2017" = "2016-2017",
         "y2017_2018" = "2017-2018",
         "y2018_2019" = "2018-2019") %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == Inf, 0, .)) %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == -Inf, 0, .))

To create the test data, we essentially create the same data frame as the one made above (by Abe), but this time looking at data from 2018-2019:

# now add some more data:
test_data <- d_personeel_tot %>%
  filter(schooljaar == "2018-2019") %>%
  right_join(tmp_file, by = c("brin_nummer"))

# now add some more data:
test_data <- d_weging %>%
  filter(schooljaar == "2018-2019") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer"))

test_data <- d_weging %>%
  rename("schoolweging_2017_2018" = "schoolweging") %>%
  filter(schooljaar == "2018-2019") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
  rename("schoolweging_2018_2019" = "schoolweging")

# missing data issues..
clean_test_data <- test_data %>%
  filter(!is.na(`2017-2018`),
         !is.na(`2018-2019`),
         !is.na(schoolweging_2017_2018),
         !is.na(schoolweging_2018_2019))

mod <- lm(`2017-2018` ~ 1 + schoolweging_2017_2018, data = clean_test_data)
clean_test_data$res_2017_2018 <- residuals(mod)

mod <- lm(`2018-2019` ~ 1 + schoolweging_2018_2019, data = clean_test_data)
clean_test_data$res_2018_2019 <- residuals(mod)

clean_test_data$diff_corrected_3 <- clean_test_data$res_2018_2019 -
  clean_test_data$res_2017_2018

df_test <- clean_test_data %>%
  select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
         `2018-2019`, diff1, diff2, diff3, res_2017_2018, res_2018_2019,
         diff_corrected_3)

test_tot <- d_tot %>%
  filter(schooljaar == "2018-2019") %>%
  left_join(df_test, by = c("brin_nummer", "vestigingsnummer")) %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))

test_tot <- test_tot[complete.cases(test_tot), ]

test_tot <- test_tot  %>%
  rename("y2015_2016" = "2015-2016",
         "y2016_2017" = "2016-2017",
         "y2017_2018" = "2017-2018",
         "y2018_2019" = "2018-2019") %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == Inf, 0, .)) %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == -Inf, 0, .))

Now that we have our training data and our test data, we will see if we need to delete any columns due to a variance of nearly 0.

getNearZeroVariationPredictors <- function(df) {

  nzv_predictors <- df %>%
    nearZeroVar(names = TRUE)

  if (length(nzv_predictors) == 0) {
    return(0L)
  } else {
    return(nzv_predictors)
  }
}

nzv_train <- getNearZeroVariationPredictors(train_tot)
nzv_train
## [1] "vestigingsnummer" "schooljaar"       "jaar"

It appears there are some variables which we should delete. However, these are not variables we intend to use in our models. So we choose to ignore the outcome here. Next, we look at highly correlated columns.

high_cor <- train_tot %>%
    select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
              plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer)) %>%
    cor() %>%
    findCorrelation(.9, names = TRUE)
  
high_cor
## [1] "perc_2F" "prop_2F" "diff2"

We now make a dataframe which we can use to create our models. First the test data (from 2017-2018).

train_model <- train_tot %>%
  select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
            plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
            diff2, diff3,
            # remove variables with high correlation
            perc_2F, y2015_2016,
            # these predictors make the r-squared 1
            y2017_2018, y2018_2019, y2016_2017,
            res_2016_2017, res_2017_2018))

colnames(train_model)
##  [1] "n_observaties"                       
##  [2] "gemeente_aantal_inwoners"            
##  [3] "gemeente_perc_inwoners_0_tot_15_jaar"
##  [4] "gemeente_bevolkingsdichtheid"        
##  [5] "ratio_leerling_leraar"               
##  [6] "gemiddelde_aanstelling"              
##  [7] "gemiddelde_leeftijd"                 
##  [8] "perc_man"                            
##  [9] "perc_tijdelijk_contract"             
## [10] "ratio_leraar_ondersteuner"           
## [11] "aantal_leerlingen"                   
## [12] "schoolweging"                        
## [13] "schoolweging_spreiding"              
## [14] "prop_2F"                             
## [15] "diff1"                               
## [16] "diff_corrected_2"

As well as the test data from 2018-2019. We rename diff2 to diff1 so it matches the column names of the training data. But we know it to be diff2.

test_model <- test_tot %>%
  select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
            plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
            diff1, diff3,
            # remove variables with high correlation
            perc_2F, y2015_2016,
            # these predictors make the r-squared 1
            y2017_2018, y2018_2019, y2016_2017,
            res_2017_2018, res_2018_2019)) %>%
  rename("diff1" = "diff2")

colnames(test_model)
##  [1] "n_observaties"                       
##  [2] "gemeente_aantal_inwoners"            
##  [3] "gemeente_perc_inwoners_0_tot_15_jaar"
##  [4] "gemeente_bevolkingsdichtheid"        
##  [5] "ratio_leerling_leraar"               
##  [6] "gemiddelde_aanstelling"              
##  [7] "gemiddelde_leeftijd"                 
##  [8] "perc_man"                            
##  [9] "perc_tijdelijk_contract"             
## [10] "ratio_leraar_ondersteuner"           
## [11] "aantal_leerlingen"                   
## [12] "schoolweging"                        
## [13] "schoolweging_spreiding"              
## [14] "prop_2F"                             
## [15] "diff1"                               
## [16] "diff_corrected_3"

Linear model

For our linear model we will be using a forward stepwise approach. We first define a nullmodel and a full model.

nullmod <- lm(diff_corrected_2 ~ 1, data = train_model)
fullmod <- lm(diff_corrected_2 ~., data = train_model)
summary(fullmod)
## 
## Call:
## lm(formula = diff_corrected_2 ~ ., data = train_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.690  -6.551  -0.602   6.156  63.539 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -7.786e+01  3.299e+00 -23.601  < 2e-16 ***
## n_observaties                        -7.157e-03  6.212e-03  -1.152  0.24935    
## gemeente_aantal_inwoners             -8.877e-07  1.106e-06  -0.802  0.42236    
## gemeente_perc_inwoners_0_tot_15_jaar -1.295e-01  7.751e-02  -1.671  0.09481 .  
## gemeente_bevolkingsdichtheid          4.004e-04  1.367e-04   2.930  0.00341 ** 
## ratio_leerling_leraar                -1.443e-01  4.692e-02  -3.075  0.00211 ** 
## gemiddelde_aanstelling                1.213e+00  1.849e+00   0.656  0.51183    
## gemiddelde_leeftijd                  -5.438e-02  3.273e-02  -1.661  0.09672 .  
## perc_man                             -1.753e-02  1.482e-02  -1.183  0.23669    
## perc_tijdelijk_contract               2.214e-03  1.590e-02   0.139  0.88924    
## ratio_leraar_ondersteuner            -2.904e-03  6.627e-03  -0.438  0.66120    
## aantal_leerlingen                    -8.720e-04  2.377e-03  -0.367  0.71376    
## schoolweging                          1.089e+00  4.452e-02  24.461  < 2e-16 ***
## schoolweging_spreiding                8.224e-01  1.856e-01   4.430  9.6e-06 ***
## prop_2F                               7.918e+01  1.154e+00  68.622  < 2e-16 ***
## diff1                                -3.955e-01  8.590e-03 -46.046  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.22 on 5368 degrees of freedom
## Multiple R-squared:  0.5492, Adjusted R-squared:  0.5479 
## F-statistic:   436 on 15 and 5368 DF,  p-value: < 2.2e-16

Then we use the step function to obtain the best model.

finalmod <- MASS::stepAIC(nullmod,
                              scope = list(upper = fullmod),
                              direction = "forward", trace = FALSE)

finalmod
## 
## Call:
## lm(formula = diff_corrected_2 ~ prop_2F + diff1 + schoolweging + 
##     schoolweging_spreiding + ratio_leerling_leraar + gemeente_bevolkingsdichtheid + 
##     n_observaties + gemiddelde_leeftijd + gemeente_perc_inwoners_0_tot_15_jaar, 
##     data = train_model)
## 
## Coefficients:
##                          (Intercept)                               prop_2F  
##                           -7.725e+01                             7.914e+01  
##                                diff1                          schoolweging  
##                           -3.957e-01                             1.091e+00  
##               schoolweging_spreiding                 ratio_leerling_leraar  
##                            8.245e-01                            -1.414e-01  
##         gemeente_bevolkingsdichtheid                         n_observaties  
##                            3.312e-04                            -9.170e-03  
##                  gemiddelde_leeftijd  gemeente_perc_inwoners_0_tot_15_jaar  
##                           -5.839e-02                            -1.314e-01
summary(finalmod)
## 
## Call:
## lm(formula = diff_corrected_2 ~ prop_2F + diff1 + schoolweging + 
##     schoolweging_spreiding + ratio_leerling_leraar + gemeente_bevolkingsdichtheid + 
##     n_observaties + gemiddelde_leeftijd + gemeente_perc_inwoners_0_tot_15_jaar, 
##     data = train_model)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.556  -6.614  -0.623   6.139  63.464 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -7.725e+01  2.973e+00 -25.979  < 2e-16 ***
## prop_2F                               7.914e+01  1.151e+00  68.729  < 2e-16 ***
## diff1                                -3.957e-01  8.581e-03 -46.111  < 2e-16 ***
## schoolweging                          1.091e+00  4.342e-02  25.132  < 2e-16 ***
## schoolweging_spreiding                8.245e-01  1.847e-01   4.465 8.19e-06 ***
## ratio_leerling_leraar                -1.414e-01  4.600e-02  -3.074 0.002121 ** 
## gemeente_bevolkingsdichtheid          3.313e-04  9.593e-05   3.453 0.000558 ***
## n_observaties                        -9.170e-03  3.014e-03  -3.043 0.002356 ** 
## gemiddelde_leeftijd                  -5.839e-02  3.093e-02  -1.888 0.059113 .  
## gemeente_perc_inwoners_0_tot_15_jaar -1.314e-01  7.571e-02  -1.735 0.082775 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.21 on 5374 degrees of freedom
## Multiple R-squared:  0.549,  Adjusted R-squared:  0.5482 
## F-statistic: 726.9 on 9 and 5374 DF,  p-value: < 2.2e-16

We Use this model to predict the test data and obtain the RMSE. We also plot the predicted data and the observed data.

pred_lm <- predict(finalmod, newdata = test_model)

RMSE_lm <- RMSE(pred = pred_lm, obs = test_model$diff_corrected_3)
RMSE_lm
## [1] 9.466832
tibble(pred_lm = pred_lm, diff_corrected_3 = test_model$diff_corrected_3) %>%
  ggplot(aes(x = pred_lm, y = diff_corrected_3)) +
  geom_point(alpha = .2) +
  geom_abline() +
  labs(title = "Predictions vs Observations", y = "Corrected Difference of
       Observations (2018/2019 - 2017/2018)", x = "LM Predictions",
       subtitle = "Using data to predict 1 year into the future",
       caption = "Data Source: Inspectorate of Education") +
  theme_minimal() +
    geom_text(x = 30, y = -30, size = 4,
    label = paste("RMSE = ", round(RMSE_lm, 3)))

Assumption check

fit <- finalmod
  1. A linear relationship between the dependent and independent variables
  • expected values of prediction errors is 0 for all i: E(ε|x1,…,xp) = 0
plot(fit, which = 1) # should be red straight line at 0

res <- lmtest::resettest(fit) # if significant --> assumption violated
res
## 
##  RESET test
## 
## data:  fit
## RESET = 8.8826, df1 = 2, df2 = 5372, p-value = 0.0001408
round(res$statistic, 5)
##   RESET 
## 8.88257
round(res$p.value, 5)
## [1] 0.00014
  1. The independent variables are not highly correlated with each other
  • we removed highly correlated predictors
  1. The variance of the residuals is constant
  • Homoscedasticity of the residuals (that is all ε have the same variance).
plot(fit, which = 3) # scale-location plot -> should be red straight line at 1

ncv <- car::ncvTest(fit) # if significant --> assumption violated
ncv
## Non-constant Variance Score Test 
## Variance formula: ~ fitted.values 
## Chisquare = 0.2477182, Df = 1, p = 0.61869
round(ncv$ChiSquare, 5)
## [1] 0.24772
round(ncv$p, 5)
## [1] 0.61869
  1. The residuals are follow a normal distribution
  • we cannot do the shapiro test because it requires a sample size between 3 and 5000
plot(fit, which = 2)

#shp <- shapiro.test(resid(fit)) # if significant assumption is violated
  1. Multivariate normality
round(car::vif(fit), 5) # > 10 suspicious
##                              prop_2F                                diff1 
##                              1.24697                              1.00525 
##                         schoolweging               schoolweging_spreiding 
##                              1.38868                              1.09715 
##                ratio_leerling_leraar         gemeente_bevolkingsdichtheid 
##                              1.15799                              1.17730 
##                        n_observaties                  gemiddelde_leeftijd 
##                              1.21692                              1.02232 
## gemeente_perc_inwoners_0_tot_15_jaar 
##                              1.04909

Outliers

in IV space

plot(fit, which = 5) # Residuals vs Leverage plot
abline(v = 2 * mean(hatvalues(fit)), col = 3, lty = 3) 

sum(hatvalues(fit) > 2 * mean(hatvalues(fit))) 
## [1] 344
outliers_iv_index <- which(hatvalues(fit) > 2 * mean(hatvalues(fit))) 

in DV space

sum(abs(rstandard(fit)) > 2)
## [1] 261
outliers_dv_index <- which(abs(rstandard(fit)) > 2)

both: cooks distance

sum(cooks.distance(fit) > 0.5)
## [1] 0

Fit model with outliers removed and compare it with the original model.

outliers_removed <- train_model[-c(outliers_iv_index, outliers_dv_index),]
fit_outliers_removed <- lm(diff_corrected_2 ~., data = outliers_removed)
summary(fit_outliers_removed)
## 
## Call:
## lm(formula = diff_corrected_2 ~ ., data = outliers_removed)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.1071  -5.9516  -0.4077   5.9053  22.3616 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.173e+01  3.129e+00 -26.116  < 2e-16 ***
## n_observaties                        -2.487e-03  6.057e-03  -0.411   0.6814    
## gemeente_aantal_inwoners             -8.558e-07  1.020e-06  -0.839   0.4015    
## gemeente_perc_inwoners_0_tot_15_jaar -8.704e-02  7.379e-02  -1.180   0.2382    
## gemeente_bevolkingsdichtheid          3.188e-04  1.297e-04   2.457   0.0140 *  
## ratio_leerling_leraar                -1.908e-01  4.766e-02  -4.003 6.34e-05 ***
## gemiddelde_aanstelling                8.688e-01  1.673e+00   0.519   0.6037    
## gemiddelde_leeftijd                  -3.030e-02  2.976e-02  -1.018   0.3088    
## perc_man                             -2.267e-02  1.342e-02  -1.690   0.0911 .  
## perc_tijdelijk_contract              -3.471e-03  1.457e-02  -0.238   0.8117    
## ratio_leraar_ondersteuner            -5.893e-04  5.725e-03  -0.103   0.9180    
## aantal_leerlingen                    -6.438e-04  2.320e-03  -0.278   0.7814    
## schoolweging                          1.192e+00  4.190e-02  28.453  < 2e-16 ***
## schoolweging_spreiding                5.757e-01  1.812e-01   3.176   0.0015 ** 
## prop_2F                               8.071e+01  1.064e+00  75.849  < 2e-16 ***
## diff1                                -4.049e-01  8.179e-03 -49.498  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.626 on 4788 degrees of freedom
## Multiple R-squared:  0.6191, Adjusted R-squared:  0.6179 
## F-statistic: 518.8 on 15 and 4788 DF,  p-value: < 2.2e-16

Lasso model

We now look at a less flexible model, the lasso regression. Before we make a lasso model, we need to create an X matrix and a y outcome vector.

## Train data
y <- train_model$diff_corrected_2
X <- train_model %>%
  select(- diff_corrected_2) %>%
  as.matrix()

## Test data
y_test <- test_model$diff_corrected_3
X_test <- test_model %>%
  select(- diff_corrected_3) %>%
  as.matrix()
lasso_fit <- cv.glmnet(X, y,
                    family = "gaussian", trace.it = 1, nfolds = 5,
                    intercept = F, alpha = 1)
## Training
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
## Fold: 1/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |======================================================================| 100%
## Fold: 2/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |======================================================================| 100%
## Fold: 3/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |======================================================================| 100%
## Fold: 4/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |======================================================================| 100%
## Fold: 5/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |======================================================================| 100%
print(lasso_fit)
## 
## Call:  cv.glmnet(x = X, y = y, nfolds = 5, trace.it = 1, family = "gaussian",      intercept = F, alpha = 1) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.0238    64   115.6 3.531      13
## 1se 0.5122    31   119.0 3.409      10

We use this model to predict the test set and calculate the RMSE. We also added a plot of the predicted and observed scores.

pred_lasso <- predict(lasso_fit, newx = X_test, s = "lambda.min")


RMSE_lasso <- RMSE(pred = pred_lasso, obs = y_test)
RMSE_lasso
## [1] 10.07046
plot(pred_lasso, y_test)
abline(0, 1)

Ridge regression model

Lastly we look at fitting a ridge model. It is similar to a lasso regression, with the difference being in the alpha constraint.

ridge_fit <- cv.glmnet(X, y,
                    family = "gaussian", trace.it = 1, nfolds = 5,
                    alpha = 0, intercept = F)
## Training
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Fold: 1/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Fold: 2/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Fold: 3/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Fold: 4/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
## Fold: 5/5
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |==============================================                        |  65%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |================================================                      |  68%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |=================================================                     |  70%
  |                                                                            
  |==================================================                    |  71%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |====================================================                  |  74%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |=====================================================                 |  76%
  |                                                                            
  |======================================================                |  77%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |=======================================================               |  79%
  |                                                                            
  |========================================================              |  80%
  |                                                                            
  |=========================================================             |  81%
  |                                                                            
  |=========================================================             |  82%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |===========================================================           |  84%
  |                                                                            
  |============================================================          |  85%
  |                                                                            
  |============================================================          |  86%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |==============================================================        |  88%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |===============================================================       |  90%
  |                                                                            
  |================================================================      |  91%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |==================================================================    |  95%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |====================================================================  |  97%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |===================================================================== |  99%
  |                                                                            
  |======================================================================| 100%
print(ridge_fit)
## 
## Call:  cv.glmnet(x = X, y = y, nfolds = 5, trace.it = 1, family = "gaussian",      alpha = 0, intercept = F) 
## 
## Measure: Mean-Squared Error 
## 
##     Lambda Index Measure    SE Nonzero
## min 0.8347   100   116.2 3.480      15
## 1se 2.5490    88   119.2 3.353      15

We use this model to predict the test data of 2018-2019. We also calculate the RMSE and plot the predicted scores and observerd scores.

pred_ridge <- predict(ridge_fit, X_test)


RMSE_ridge <- RMSE(pred = pred_ridge, obs = y_test)
RMSE_ridge
## [1] 10.29709
plot(pred_ridge, y_test)
abline(0, 1)

Classification tree

We also fit a classification tree. This is a model that is more flexible than an lm model.

trCntrl <- trainControl("cv", 5, allowParallel = TRUE)

fit_tree <- caret::train(x = X, y = y, method = "rpart", trControl = trCntrl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
fit_tree
## CART 
## 
## 5384 samples
##   15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4308, 4306, 4306, 4308, 4308 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.06070607  12.86789  0.2833116  10.03975
##   0.06430842  13.38301  0.2245544  10.44370
##   0.19404799  14.33378  0.1819956  11.20758
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.06070607.

We now fit this tree model to the test data.

pred_tree <- predict(fit_tree, newdata = X_test)

RMSE_tree <- RMSE(pred = pred_tree, obs = y_test)
RMSE_tree
## [1] 12.90546
plot(pred_tree, y_test)
abline(0, 1)

Random forest

We now look at a different classification model: The random forest. Here we have to specify some parameters. For the mtry parameter we use the recommended amount, the sqrt() of the number of columns of the dataframe. For splitrule we use variance and minimal node size we set to 1.

mtry_ncol <- sqrt(ncol(X))

## Fit Random forrest using Caret::Train
fit_rf <- train(X, y, method = "ranger",
    trControl = trCntrl,
    tuneGrid = expand.grid(mtry = mtry_ncol,
                           splitrule = "variance", min.node.size = 1)
)
fit_rf$finalModel
## Ranger result
## 
## Call:
##  ranger::ranger(dependent.variable.name = ".outcome", data = x,      mtry = min(param$mtry, ncol(x)), min.node.size = param$min.node.size,      splitrule = as.character(param$splitrule), write.forest = TRUE,      probability = classProbs, ...) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      5384 
## Number of independent variables:  15 
## Mtry:                             3 
## Target node size:                 1 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       111.9331 
## R squared (OOB):                  0.5152332

We use this model to predict and calculate the RMSE. We then show the prediction accuracy in a plot.

pred_rf <- predict(fit_rf, newdata = X_test)

RMSE_rf <- RMSE(pred = pred_rf, obs = y_test)
RMSE_rf
## [1] 10.19637
plot(pred_rf, y_test)
abline(0, 1)

Now that we have our various models, it is time to compare. We will be looking to see which of the models has the lowest RMSE - indicating the best predictions on the data from a year later.

RMSE_df <- data.frame(models = c("lm", "lasso",
                                 "ridge", "decision tree", "random forest"),
           RMSE = c(RMSE_lm, RMSE_lasso, RMSE_ridge, RMSE_tree, RMSE_rf))

RMSE_df <- RMSE_df %>%
  mutate(min = min(RMSE) == RMSE)

## Plot showing best performance
RMSE_df %>%
  arrange(RMSE) %>%
  ggplot(aes(x = models, y = RMSE, fill = min)) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    labs(y = "RMSE",
         x = "",
         title = "Model comparison of test prediction accuracy",
         subtitle = "using Root Mean Squared Error (RMSE) to compare",
         caption = "Best model displayed in Red                                                                                         Data source: Inspectorate of Education") + #nolint
    coord_flip(ylim = c(0, 15)) +
    theme_minimal() +
    scale_fill_manual(values = c("black", "Red")) +
    geom_text(aes(label = round(RMSE, 3)), hjust = -0.5)

From this we can conclude that the best model to predict future data is the linear model using forward stepwise selection.

Bonus

To answer another question that came up: How far ahead can we predict, using these models.

We will be answering this question by looking if our best performing model (approach) can be used to predict data 2 years ahead, instead of 1 year only. We first make new training data - using the same approach as before, but now for 2016-2017.

# now add some more data:
train_data2 <- d_personeel_tot %>%
  filter(schooljaar == "2016-2017") %>%
  right_join(tmp_file, by = c("brin_nummer"))

# now add some more data:
train_data2 <- d_weging %>%
  filter(schooljaar == "2016-2017") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer"))

train_data2 <- d_weging %>%
  rename("schoolweging_2015_2016" = "schoolweging") %>%
  filter(schooljaar == "2016-2017") %>%
  right_join(tmp_file, by = c("brin_nummer", "vestigingsnummer")) %>%
  rename("schoolweging_2016_2017" = "schoolweging")

# missing data issues..
clean_train_data2 <- train_data2 %>%
  filter(!is.na(`2015-2016`),
         !is.na(`2016-2017`),
         !is.na(schoolweging_2015_2016),
         !is.na(schoolweging_2016_2017))

mod <- lm(`2015-2016` ~ 1 + schoolweging_2015_2016, data = clean_train_data2)
clean_train_data2$res_2015_2016 <- residuals(mod)

mod <- lm(`2016-2017` ~ 1 + schoolweging_2016_2017, data = clean_train_data2)
clean_train_data2$res_2016_2017 <- residuals(mod)

clean_train_data2$diff_corrected_1 <- clean_train_data2$res_2016_2017 -
  clean_train_data2$res_2015_2016

df_train2 <- clean_train_data2 %>%
  select(brin_nummer, vestigingsnummer, `2015-2016`, `2016-2017`, `2017-2018`,
         `2018-2019`, diff1, diff2, diff3, res_2015_2016, res_2016_2017,
         diff_corrected_1)

train_tot2 <- d_tot %>%
  filter(schooljaar == "2016-2017") %>%
  left_join(df_train2, by = c("brin_nummer", "vestigingsnummer")) %>%
  mutate(aantal_leerlingen = as.numeric(aantal_leerlingen))

train_tot2 <- train_tot2[complete.cases(train_tot2), ]

train_tot2 <- train_tot2  %>%
  rename("y2015_2016" = "2015-2016",
         "y2016_2017" = "2016-2017",
         "y2017_2018" = "2017-2018",
         "y2018_2019" = "2018-2019") %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == Inf, 0, .)) %>%
  mutate_if(is.numeric, ~ifelse(abs(.) == -Inf, 0, .))

train_model2 <- train_tot2 %>%
  select(-c(brin_nummer, vestigingsnummer, schooljaar, postcode,
            plaatsnaam, gemeentenaam, provincie, jaar, gemeentenummer,
            diff2, diff3, diff1,
            # remove variables with high correlation
            perc_2F, y2015_2016,
            # these predictors make the r-squared 1
            y2017_2018, y2018_2019, y2016_2017,
            res_2015_2016, res_2016_2017))

colnames(train_model2)
##  [1] "n_observaties"                       
##  [2] "gemeente_aantal_inwoners"            
##  [3] "gemeente_perc_inwoners_0_tot_15_jaar"
##  [4] "gemeente_bevolkingsdichtheid"        
##  [5] "ratio_leerling_leraar"               
##  [6] "gemiddelde_aanstelling"              
##  [7] "gemiddelde_leeftijd"                 
##  [8] "perc_man"                            
##  [9] "perc_tijdelijk_contract"             
## [10] "ratio_leraar_ondersteuner"           
## [11] "aantal_leerlingen"                   
## [12] "schoolweging"                        
## [13] "schoolweging_spreiding"              
## [14] "prop_2F"                             
## [15] "diff_corrected_1"

We Use this model to train a new linear model.

nullmod2 <- lm(diff_corrected_1 ~ 1, data = train_model2)
fullmod2 <- lm(diff_corrected_1 ~., data = train_model2)
summary(fullmod2)
## 
## Call:
## lm(formula = diff_corrected_1 ~ ., data = train_model2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.000  -8.844  -1.356   7.688  72.816 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          -8.319e+01  4.664e+00 -17.836  < 2e-16 ***
## n_observaties                        -2.606e-03  8.213e-03  -0.317 0.750991    
## gemeente_aantal_inwoners              3.597e-07  1.504e-06   0.239 0.811006    
## gemeente_perc_inwoners_0_tot_15_jaar  5.247e-02  1.027e-01   0.511 0.609373    
## gemeente_bevolkingsdichtheid         -2.396e-04  1.871e-04  -1.281 0.200376    
## ratio_leerling_leraar                -1.059e-01  6.327e-02  -1.673 0.094386 .  
## gemiddelde_aanstelling                2.923e+00  2.604e+00   1.123 0.261693    
## gemiddelde_leeftijd                  -1.476e-02  4.464e-02  -0.331 0.740909    
## perc_man                             -1.663e-02  2.009e-02  -0.828 0.407839    
## perc_tijdelijk_contract               9.132e-02  2.272e-02   4.019 5.93e-05 ***
## ratio_leraar_ondersteuner            -7.518e-03  6.049e-03  -1.243 0.213947    
## aantal_leerlingen                    -1.666e-03  3.206e-03  -0.520 0.603356    
## schoolweging                          1.296e+00  6.886e-02  18.821  < 2e-16 ***
## schoolweging_spreiding                8.318e-01  2.465e-01   3.374 0.000745 ***
## prop_2F                               7.056e+01  1.530e+00  46.131  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.78 on 5387 degrees of freedom
## Multiple R-squared:  0.2875, Adjusted R-squared:  0.2857 
## F-statistic: 155.3 on 14 and 5387 DF,  p-value: < 2.2e-16
finalmod2 <- MASS::stepAIC(nullmod2,
                              scope = list(upper = fullmod2),
                              direction = "forward", trace = FALSE)

finalmod2
## 
## Call:
## lm(formula = diff_corrected_1 ~ prop_2F + schoolweging + perc_tijdelijk_contract + 
##     schoolweging_spreiding + aantal_leerlingen + ratio_leerling_leraar, 
##     data = train_model2)
## 
## Coefficients:
##             (Intercept)                  prop_2F             schoolweging  
##              -82.213529                70.714035                 1.316669  
## perc_tijdelijk_contract   schoolweging_spreiding        aantal_leerlingen  
##                0.090929                 0.778440                -0.002867  
##   ratio_leerling_leraar  
##               -0.090165
summary(finalmod2)
## 
## Call:
## lm(formula = diff_corrected_1 ~ prop_2F + schoolweging + perc_tijdelijk_contract + 
##     schoolweging_spreiding + aantal_leerlingen + ratio_leerling_leraar, 
##     data = train_model2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -54.395  -8.928  -1.376   7.695  72.210 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -82.213529   3.347211 -24.562  < 2e-16 ***
## prop_2F                  70.714035   1.524110  46.397  < 2e-16 ***
## schoolweging              1.316669   0.066531  19.790  < 2e-16 ***
## perc_tijdelijk_contract   0.090929   0.021318   4.265 2.03e-05 ***
## schoolweging_spreiding    0.778440   0.237846   3.273  0.00107 ** 
## aantal_leerlingen        -0.002867   0.001431  -2.003  0.04520 *  
## ratio_leerling_leraar    -0.090165   0.061985  -1.455  0.14583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.77 on 5395 degrees of freedom
## Multiple R-squared:  0.2868, Adjusted R-squared:  0.286 
## F-statistic: 361.6 on 6 and 5395 DF,  p-value: < 2.2e-16

As we can see this model - with a R-squared of .29 - explains a lot less variance in the outcome variable than the previous model - with a R-squared of .55. A possible explanation for this is the fact that the first model used the difference score of the previous year as one of its predictor variables, something that cannot be used for this model using the first possible difference score.

We will now try to predict the scores 2 years later, using this model.

pred_lm2 <- predict(finalmod2, newdata = test_model)

RMSE_lm2 <- RMSE(pred = pred_lm2, obs = test_model$diff_corrected_3)
RMSE_lm2
## [1] 12.50975
tibble(pred_lm = pred_lm2, diff_corrected_3 = test_model$diff_corrected_3) %>%
  ggplot(aes(x = pred_lm, y = diff_corrected_3)) +
  geom_point(alpha = .2) +
  geom_abline() +
  labs(title = "Predictions vs Observations", y = "Corrected Difference of
       Observations (2018/2019 - 2017/2018)", x = "LM Predictions",
       subtitle = "Using data to predict 2 years into the future",
       caption = "Data Source: Inspectorate of Education") +
  theme_minimal() +
  geom_text(x = 30, y = -30, size = 4,
    label = paste("RMSE = ", round(RMSE_lm2, 3)))

As we can see the RMSE is quite a bit lower for data predicted two years ahead. It scored just above the decision tree (which used a categorical classification model, so it is natural that it performs worse than a continuous prediction model).

Checking code style

lintr::lint(filename = rstudioapi::getSourceEditorContext()$path)
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:635:1: style: Variable and function name style should be snake_case.
## getNearZeroVariationPredictors <- function(df) {
## ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:716:1: style: Variable and function name style should be snake_case.
## RMSE_lm <- RMSE(pred = pred_lm, obs = test_model$diff_corrected_3)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:775:2: style: Commented code should be removed.
## #shp <- shapiro.test(resid(fit)) # if significant assumption is violated
##  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:792:55: style: Trailing whitespace is superfluous.
## abline(v = 2 * mean(hatvalues(fit)), col = 3, lty = 3) 
##                                                       ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:793:47: style: Trailing whitespace is superfluous.
## sum(hatvalues(fit) > 2 * mean(hatvalues(fit))) 
##                                               ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:794:70: style: Trailing whitespace is superfluous.
## outliers_iv_index <- which(hatvalues(fit) > 2 * mean(hatvalues(fit))) 
##                                                                      ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:816:74: style: Commas should always have a space after.
## outliers_removed <- train_model[-c(outliers_iv_index, outliers_dv_index),]
##                                                                          ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:828:1: style: Variable and function name style should be snake_case.
## X <- train_model %>%
## ^
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:834:1: style: Variable and function name style should be snake_case.
## X_test <- test_model %>%
## ^~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:853:1: style: Variable and function name style should be snake_case.
## RMSE_lasso <- RMSE(pred = pred_lasso, obs = y_test)
## ^~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:877:1: style: Variable and function name style should be snake_case.
## RMSE_ridge <- RMSE(pred = pred_ridge, obs = y_test)
## ^~~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:888:1: style: Variable and function name style should be snake_case.
## trCntrl <- trainControl("cv", 5, allowParallel = TRUE)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:898:1: style: Variable and function name style should be snake_case.
## RMSE_tree <- RMSE(pred = pred_tree, obs = y_test)
## ^~~~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:924:1: style: Variable and function name style should be snake_case.
## RMSE_rf <- RMSE(pred = pred_rf, obs = y_test)
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:933:1: style: Variable and function name style should be snake_case.
## RMSE_df <- data.frame(models = c("lm", "lasso",
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:937:1: style: Variable and function name style should be snake_case.
## RMSE_df <- RMSE_df %>%
## ^~~~~~~
## /Users/jessica/Downloads/Pressure_cooker_final.Rmd:1052:1: style: Variable and function name style should be snake_case.
## RMSE_lm2 <- RMSE(pred = pred_lm2, obs = test_model$diff_corrected_3)
## ^~~~~~~~